knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, cache = FALSE, fig.show = "asis",
fig.align = "center")
options(scipen=999)
library(sf)
library(ggplot2)
library(tidycensus)
library(tidyverse)
library(ggtext)
library(glue)
library(leaflet)
library(mapview)
library(patchwork)
library(lwgeom)
library(FNN)
library(kableExtra)
library(corrplot)
library(htmltools) # For div() function
library(knitr) # insert images
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")
# Change to your own workspace
# knitr::opts_knit$set(root.dir = "E:/Spring/Practicum/DataAnalysis/Chinatown") # Luming
# wd <- "E:/Spring/Practicum/DataAnalysis/Chinatown"
# Aki
knitr::opts_knit$set(root.dir = "~/Documents/MUSA/Spring25/MUSA8010_Practicum/Chinatown/")
wd <- "~/Documents/MUSA/Spring25/MUSA8010_Practicum/Chinatown/"
nhoods_4326 <-
# st_read("DataWrangling/data/philadelphia-neighborhoods.geojson") %>% # Luming
st_read("data/philadelphia-neighborhoods.geojson") %>% # Aki
st_transform('EPSG:4326')
# philly properties (from Sijia)
# from the last 5 years, inflation-adjusted prices, total area > 100 sq ft, prices > 30,000, outliers above 5 SD from the mean were removed
philly_properties <- st_read("data/property_philly_cleaned.geojson") # Aki
# philly_properties <- st_read("DataWrangling/data/property_philly_cleaned.geojson") # Luming
It is hard to talk about the history of the Philadelphia Chinatown without the construction of I-676, also known as the Vine Street Expressway. Although the highway serves as a significant thoroughfare through Center City Philadelphia, the indispensable repercussion on the adjacent neighborhoods still lingers today. Consequences of the highway such as demolishment of buildings, pedestrian safety threats, traffic congestion, as well as noise and air pollutions had been harming Chinatown and its surrounding neighborhoods over the past decades and are yet to be addressed.
Standing in opposition to the issues, City of Philadelphia Office of Transportation and Infrastructure and Sustainability (OTIS) brought forth the plan of capping sections of the expressway as a solution to address its historical harms on the Chinatown region. This “cap”, named the Chinatown Stitch project, aimed to create a safe and equitable green space for the public that would reconnect areas north and south to the Vine Street expressway. Receiving funds from the US department of Transportation and the local organizations, the construction of the Chinatown Stitch will start in 2027 and is expected to be completed in early 2030.
This purpose of this study is to help OTIS understand the housing market of Philadelphia Chinatown region before and after the implementation of the Chinatown Stitch. The first part of the study will focus the effects of highways on sales prices in the study region as well as the entire Philadelphia. House prices will then be estimated using three predictive models under three different future scenarios: business as usual without the implementation of the Chinatown Stitch, business as usual with the implementation of the Chinatown Stitch, and an alternative future in which the Chinatown Stitch significantly reshapes the urban landscape of the study region.
Philly_blockgroup <- st_read("Dataset/Philly_blockgroup/Philly_blockgroup.shp") %>%
st_transform('EPSG:2272')
# philly bounds
philly_bounds <- st_union(Philly_blockgroup)
studyarea <- st_read("Dataset/studyarea/StudyArea.shp") %>%
st_transform('EPSG:2272')
studyarea_blockgroup <- st_read("Dataset/studyarea/StudyArea_blockgroup_tract.shp") %>%
st_transform('EPSG:2272')
studyarea_north <- st_read("Dataset/studyarea-sub/studyarea_north.shp") %>%
st_transform('EPSG:2272')
studyarea_south <- st_read("Dataset/studyarea-sub/studyarea_south.shp") %>%
st_transform('EPSG:2272')
I_676 <- st_read("Dataset/Highways/I_676.shp") %>%
st_transform('EPSG:2272')
discontinuity <- st_read("Dataset/studyarea-sub/discontinuity.shp") %>%
st_transform('EPSG:2272')
Chinatown_Stitch <- st_read("Dataset/Chinatown_Stitch/Chinatown_Stitch.shp") %>%
st_transform('EPSG:2272')
property_original <- st_read("Dataset/original_property_studyarea.geojson") %>%
st_transform('EPSG:2272')
property_north <- property_original %>%
st_intersection(studyarea_north)
property_south <- property_original %>%
st_intersection(studyarea_south)
# philly hydrology for mapping (bounded by philly_bounds; source: https://opendataphilly.org/datasets/hydrology/)
hydro <- st_read("https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Hydrographic_Features_Poly/FeatureServer/1/query?outFields=*&where=1%3D1&f=geojson") %>%
st_transform(crs = 'EPSG:2272') %>%
st_intersection(philly_bounds)
philly_highways <- st_read("Dataset/Highways/Casestudy_Highways.shp") %>%
st_transform('EPSG:2272')
Three study areas are used in the analysis to reveal the effects of I-676 on the land market of the Chinatown region.
StudyArea_plot1 <- ggplot() +
geom_sf(data = Philly_blockgroup, fill = "transparent", color = "grey90", linewidth = .2) +
geom_sf(data = studyarea, fill = "#eb5600", color = "#eb5600") +
geom_sf(data = hydro, fill = "#DFF3E3", color = "transparent") +
geom_sf(data = philly_bounds, fill = "transparent", color = "grey60", linewidth = 1, linetype = "dashed") +
theme_void() +
labs(title = "Three Study Areas:",
subtitle = "Philadelphia | 10 Block Groups around the Chinatown Stitch | North and South Sides",
caption = "Source: ACS Census Data and OPA Data.") +
theme(plot.title = element_text(size = 16, face = "bold",margin = margin(b = 6)),
plot.subtitle = element_text(margin = margin(b = 10)),
plot.caption = element_text(hjust = 0, margin = margin(t = 10)))
StudyArea_plot2 <- ggplot() +
geom_sf(data = studyarea_north, fill = alpha("#eb5600", .1), color = "transparent") +
geom_sf(data = studyarea_south, fill = alpha("#1a9988", .1), color = "transparent") +
# geom_sf(data = discontinuity, fill = "black", color = "transparent") +
geom_sf(data = Chinatown_Stitch, fill = "grey", color = "transparent", linetype = "dashed", linewidth = 1) +
geom_sf(data = studyarea_blockgroup, fill = "transparent", color = "grey", linewidth = .8, linetype = "dashed") +
geom_sf(data = property_north, color = "#eb5600", size = .6) +
geom_sf(data = property_south, color = "#1a9988", size = .6) +
geom_sf(data = studyarea, fill = "transparent", color = "grey60", linewidth = 1.2, linetype = "dashed") +
theme_void()
StudyArea_plot1 | StudyArea_plot2
studyarea_4326 <- st_transform(studyarea, crs = 4326)
bbox <- as.list(st_bbox(studyarea_4326))
ChinatownStitch_4326 <- st_read("Dataset/Chinatown_Stitch/Chinatown_Stitch.shp") %>%
st_transform('EPSG:4326')
I676_4326 <- st_transform(I_676, crs = 4326)
landuse_4326 <- st_read("Dataset/landuse_clip/Land_Use_ClipLayer.shp") %>%
st_transform('EPSG:4326')
landuse_4326_plot <- landuse_4326 %>%
filter(C_DIG2DESC != 51 & C_DIG2DESC != 71)
landuse_park_ing <- landuse_4326 %>%
filter(OBJECTID_1 == 514214)
park_4326 <- st_read("Dataset/ParksModified/Parks_within_1km.shp") %>%
st_transform('EPSG:4326')
# nhoods_4326 <-
# st_read("DataWrangling/data/philadelphia-neighborhoods.geojson") %>%
# # st_read("data/philadelphia-neighborhoods.geojson") %>%
# st_transform('EPSG:4326')
Chinatown_Callowhill <- nhoods_4326 %>%
filter(NAME %in% c("CHINATOWN", "CALLOWHILL"))
In the vision of a city park in the future, the Chinatown Stitch will bring vitality to local residents and visitors along with other green spaces. Connected with the Stitch, the Rail Park in the north (Callowhill neighborhood) is still in construction and will make an impact years later. With a foundation of cultural and commercial resources, the southern part (Chinatown neighborhood) will provide services to the surrounding area, including the city center of Philadelphia.
UseCase <- leaflet() %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addPolygons(data = studyarea_4326,
color = "black", # Black border
weight = 4, # Border thickness
dashArray = "8,12", # Dashed line pattern (5px on, 5px off)
fill = FALSE) %>%
addPolygons(data = landuse_4326_plot,
color = "grey",
weight = 0.8,
fillColor = "#DFF3E3",
fillOpacity = 0.4) %>%
addPolygons(data = Chinatown_Callowhill,
color = "#eb5600",
weight = 1,
opacity = 0.5,
dashArray = "5,5",
fillColor = "#eb5600",
fillOpacity = 0.1) %>%
addPolygons(data = park_4326,
color = "#1a9988",
weight = 1,
fillOpacity = 0.6,
fillColor = "#1a9988") %>%
addPolygons(data = landuse_park_ing,
color = "#1a9988",
weight = 2,
opacity = 0.8,
fill = FALSE) %>%
addPolylines(data = I676_4326,
color = "#eb5600",
opacity = 1,
weight = 2) %>%
addPolygons(data = ChinatownStitch_4326,
color = "#eb5600",
weight = 2,
opacity = 1,
fillColor = "#1a9988",
fillOpacity = 0.8
# fill = alpha("#1a9988", 0.5)
) %>%
fitBounds(
lng1 = bbox$xmin,
lat1 = bbox$ymin,
lng2 = bbox$xmax,
lat2 = bbox$ymax
)
div(class = "center-leaflet", UseCase)
|
|
|
| Callowhill | Chinatown |
To achieve the goal of a data-rich story map, the methodology is structured around a robust model construction process. By incorporating illustrative data visualizations throughout the data analysis, the story map features engaging maps and compelling narratives.
TODO: include description of qualitative analysis
# property <- st_read("DataWrangling/data/opa_properties_public.geojson") %>%
# st_transform('EPSG:2272')
#
# property_studyarea <- st_intersection(property, studyarea)
#
# st_write(property_studyarea %>% st_transform('EPSG:3857'), "Dataset/original_property_studyarea.geojson", driver = "GeoJSON")
property_studyarea <- st_read("Dataset/original_property_studyarea.geojson") %>%
st_transform('EPSG: 2272')
landuse <- st_read("Dataset/landuse_clip/Land_Use_ClipLayer.shp") %>%
st_transform('EPSG:2272')
landuse_rename <- landuse %>%
mutate(landuse =
case_when(
C_DIG2DESC == 11 ~ "Residential Low",
C_DIG2DESC == 12 ~ "Residential Medium",
C_DIG2DESC == 13 ~ "Residential High",
C_DIG2DESC == 21 ~ "Commercial Consumer",
C_DIG2DESC == 22 ~ "Commercial Business/Professional",
C_DIG2DESC == 23 ~ "Commercial Mixed Residential",
C_DIG2DESC == 31 ~ "Industrial",
C_DIG2DESC == 41 ~ "Civic/Institution",
C_DIG2DESC == 51 ~ "Transportation",
C_DIG2DESC == 61 ~ "Culture/Amusement",
C_DIG2DESC == 62 ~ "Active Recreation",
C_DIG2DESC == 71 ~ "Park/Open Space",
C_DIG2DESC == 72 ~ "Cemetery",
C_DIG2DESC == 81 ~ "Water",
C_DIG2DESC == 91 ~ "Vacant",
C_DIG2DESC == 92 ~ "Other/Unknown"
))
# philly properties (from Sijia)
# from the last 5 years, inflation-adjusted prices, total area > 100 sq ft, prices > 30,000, outliers above 5 SD from the mean were removed
# philly_properties <- st_read("data/property_philly_250318.geojson")
nhoods <-
st_read("Dataset/DataWrangle/philadelphia-neighborhoods.geojson") %>%
st_transform('EPSG:2272')
park <- st_read("Dataset/ParksModified/Parks_within_1km.shp") %>%
st_transform('EPSG:2272')
# philly park data (https://opendataphilly.org/datasets/ppr-properties/)
philly_parks <- st_read("https://opendata.arcgis.com/datasets/d52445160ab14380a673e5849203eb64_0.geojson") %>%
st_transform(crs = 'EPSG:2272') %>%
st_intersection(philly_bounds)
metro <-
st_read("https://opendata.arcgis.com/api/v3/datasets/af52d74b872045d0abb4a6bbbb249453_0/downloads/data?format=geojson&spatialRefId=4326") %>%
st_transform('EPSG:2272') %>%
mutate(Type = "metro")
city_hall <- metro[metro$Station == 'City Hall', 6]
trolley <-
st_read("https://opendata.arcgis.com/api/v3/datasets/dd2afb618d804100867dfe0669383159_0/downloads/data?format=geojson&spatialRefId=4326") %>%
st_transform('EPSG:2272')
trolley_renamed <- trolley %>%
rename(Station = StopName,
Route = LineAbbr,
Longitude = Lon,
Latitude = Lat) %>%
mutate(Type = "trolley")
# Combine both datasets into one
transit <- bind_rows(metro, trolley_renamed)
school <-
st_read("https://opendata.arcgis.com/datasets/d46a7e59e2c246c891fbee778759717e_0.geojson") %>%
st_transform('EPSG:2272')
hospital <-
st_read("Dataset/DataWrangle/DOH_Hospitals202311.geojson") %>%
st_transform('EPSG:2272') %>%
st_filter(st_union(Philly_blockgroup))
water <-
st_read("https://services.arcgis.com/fLeGjb7u4uXqeF9q/arcgis/rest/services/Hydrographic_Features_Poly/FeatureServer/1/query?outFields=*&where=1%3D1&f=geojson") %>%
st_transform('EPSG:2272')
bike_network <-
st_read("Dataset/DataWrangle/Bike_Network.geojson") %>%
st_transform('EPSG:2272')
phillyCrimes <-
st_read("Dataset/DataWrangle/phillyCrimes.geojson") %>%
st_transform('EPSG:2272')
# LPSS_PER1000: Number of low-produce supply stores per 1,000 people
# HPSS_PER1000: Number of high-produce supply stores per 1,000 people
retail <-
st_read("https://opendata.arcgis.com/datasets/53b8a1c653a74c92b2de23a5d7bf04a0_0.geojson") %>%
st_transform('EPSG:2272')
calculate_nearest_distance <- function(set_points, other_layer) {
nearest_idx <- st_nearest_feature(set_points, other_layer)
st_distance(set_points, other_layer[nearest_idx, ], by_element = TRUE) %>% as.numeric()
}
Luming_property <- property_studyarea %>%
dplyr::select(objectid, geometry, sale_price) %>%
rename(orig_objectid = objectid)
Luming_property <- Luming_property %>%
mutate(distance_to_city_hall = st_distance(., city_hall) %>% as.numeric()) %>%
mutate(
distance_to_nearest_transit = calculate_nearest_distance(geometry, transit),
distance_to_nearest_hospital = calculate_nearest_distance(geometry, hospital),
distance_to_nearest_school = calculate_nearest_distance(geometry, school),
distance_to_nearest_park = calculate_nearest_distance(geometry, park),
distance_to_nearest_water = calculate_nearest_distance(geometry, water),
distance_to_nearest_bikelane = calculate_nearest_distance(geometry, bike_network),
distance_to_I676 = calculate_nearest_distance(geometry, I_676)
) %>%
st_join(nhoods["NAME"], left = TRUE) %>%
rename(nhoods_name = NAME) %>%
st_join(landuse_rename["landuse"], left = TRUE) %>%
st_join(retail[, c("LPSS_PER1000", "HPSS_PER1000")], left = TRUE)
Luming_property <- Luming_property %>%
mutate(
crime_nn1 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 1),
crime_nn2 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 2),
crime_nn3 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 3),
crime_nn4 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 4),
crime_nn5 = nn_function(st_coordinates(Luming_property),
st_coordinates(phillyCrimes), k = 5))
property_EDA <- st_read("Dataset/property_sijia_eda.geojson") %>%
st_transform('EPSG:2272')
property_EDA_update <- property_EDA %>%
dplyr::select(-sale_price.y) %>%
rename(sale_price = sale_price.x) %>%
mutate(log_sale_price = ifelse(!is.na(sale_price) & sale_price == 0,
log(sale_price + 1),
log(sale_price)))
property_EDA_update <-
rbind(
property_EDA_update %>% st_intersection(studyarea_north["geometry"]) %>%
mutate(I676 = "north"),
property_EDA_update %>% st_intersection(studyarea_south["geometry"]) %>%
mutate(I676 = "south")
)
Before jumping into modeling, we first explored the relationship between distance to highway and property prices. We used property data from OPA Philadelphia that was cleaned to include only the data from the last five years (Jan. 1st, 2020 to Dec. 31st, 2024) with prices adjusted for inflation after removing extreme outliers and improbable values, and keeping properties with total area of over 100 square feet. Looking at properties (of all kinds) within 500 m of Phildelphia’s main highways – I-95 on the East, I-76 on the West, US-1 in the North, and I-676 through the city – we saw the following trends in property prices.
buff500 <- 500 * 3.28084
hiway_500buff <- st_union(st_buffer(philly_highways, buff500))
# adding distance to highway and log price variables
# properties500 <- philly_properties[hiway_500buff,] %>%
# mutate(price_perTLA = adjusted_sale_price / total_livable_area,
# price_perTA = adjusted_sale_price / total_area,
# log_price = log(adjusted_sale_price),
# log_price_perTLA = log(price_perTLA),
# log_price_perTA = log(price_perTA),
# dist_to_US001 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0001")),
# dist_to_I076 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0076")),
# dist_to_I095 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0095")),
# dist_to_I676 = calculate_nearest_distance(geometry,
# philly_highways %>%
# filter(ST_RT_NO == "0676")),
# dist_to_US001m = dist_to_US001 * 0.3048,
# dist_to_I076m = dist_to_I076 * 0.3048,
# dist_to_I095m = dist_to_I095 * 0.3048,
# dist_to_I676m = dist_to_I676 * 0.3048)
#
# prop500_closest <- properties500 %>%
# select(matches("objectid|dist_to")) %>%
# pivot_longer(cols = 2:5,
# names_to = "highway",
# values_to = "distance") %>%
# group_by(objectid) %>%
# summarise(dist_to_closest_highway = min(distance, na.rm = T)) %>%
# ungroup() %>%
# mutate(dist_to_closest_highwaym = dist_to_closest_highway * 0.3048)
#
# # join closest highway
# prop500 <- left_join(properties500,
# prop500_closest %>% st_drop_geometry(),
# by = "objectid") %>%
# mutate(closest_highway = case_when(
# dist_to_closest_highway == dist_to_US001 ~ "US-1",
# dist_to_closest_highway == dist_to_I076 ~ "I-76",
# dist_to_closest_highway == dist_to_I095 ~ "I-95",
# dist_to_closest_highway == dist_to_I676 ~ "I-676",
# .default = NA),
# closest_highway = factor(closest_highway, levels = c("US-1", "I-76", "I-676", "I-95")),
# highway_type = case_when(closest_highway %in% c("US-1","I-676") ~ "through neighborhood",
# closest_highway %in% c("I-95","I-76") ~ "along waterway",
# .default = NA))
# save output and read in the prefiltered to save time
# st_write(prop500, "data/philly_prop500_dst2hghwy_250325.geojson")
# read in pre-saved file
prop500 <- st_read("Dataset/PhillyPropertiesNearHighways/philly_prop500_dst2hghwy_250325.geojson") %>%
mutate(closest_highway = factor(closest_highway, levels = c("US-1", "I-76", "I-676", "I-95")))
# map of properties within 500m of highway cored by which highway they're closest to
ggplot() +
geom_sf(data = philly_bounds, fill = "#c9b887", color = "grey") +
geom_sf(data = hydro, fill = "#7d97c7", color = "transparent") +
geom_sf(data = philly_parks, fill = "#6b9169", color = "transparent") +
geom_sf(data = hiway_500buff,
color = "transparent", fill = "grey", alpha = 0.4) +
geom_sf(data = prop500,
aes(color = closest_highway), size = 0.3,
show.legend = F) +
geom_sf(data = philly_highways,
color = "black") +
# add custom colors
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
# labels for highway names
geom_text(aes(label = "US-1",
x = 2700000,
y = 256453.2),
color = "#1a9988") +
geom_text(aes(label = "I-76",
x = 2680000,
y = 235000),
color = "#eb5600") +
geom_text(aes(label = "I-95",
x = 2705200,
y = 235000),
color = "#eba075") +
geom_text(aes(label = "I-676",
x = 2694000,
y = 243000),
color = "#7fe3d6") +
labs(title = "Properties within 500m of Highways in Philly",
subtitle = "US-1, I-76, I-95, and I-676",
color = "Closest Highway") +
theme_void()
prop500_faceted <-
ggplot(data = prop500,
aes(x = log_price_perTA, color = closest_highway, fill = closest_highway)) +
geom_histogram(bins = 50, show.legend = F) +
facet_wrap(~closest_highway, scales = "free_y") +
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
scale_fill_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
labs(title = "Distribution of Property Sale Price",
subtitle = "Split by Closest Highway",
x = "Log Sale Price (per Total Area)",
y = "Count") +
theme_minimal()
prop500_faceted
# median sale prices to talk about in the text
med_price_us1 <- round(median(prop500 %>%
filter(closest_highway == "US-1") %>%
pull(adjusted_sale_price)))
med_price_i76 <- round(median(prop500 %>%
filter(closest_highway == "I-76") %>%
pull(adjusted_sale_price)))
med_price_i95 <- round(median(prop500 %>%
filter(closest_highway == "I-95") %>%
pull(adjusted_sale_price)))
med_price_676 <- round(median(prop500 %>%
filter(closest_highway == "I-676") %>%
pull(adjusted_sale_price)))
We see that the distribution of log sale prices of properties within 500m of highways in Philadelphia generally follow a normal distribution. Though there aren’t as many properties near I-676 as other there are near the other highways, properties are most expensive here with a median property price of $972,362. Properties near I-95 and I-76 are among a similar range of prices with median property prices being $354,750 and $315,203, respectively, though there are almost five times as many properties near I-95 compared to I-76. Finally, there are many properties within 500m of US-1, but the median property price was lowest compared to other highways at $234,847.
# simple scatter of price (as log price per total area in sq ft) and dist2closesthighway
cor_us1 <- round(cor(prop500 %>%
filter(grepl("US", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("US", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i76 <- round(cor(prop500 %>%
filter(grepl("I-76", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-76", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i95 <- round(cor(prop500 %>%
filter(grepl("I-95", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-95", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
cor_i676 <- round(cor(prop500 %>%
filter(grepl("I-676", closest_highway)) %>%
pull(log_price_perTA),
prop500 %>%
filter(grepl("I-676", closest_highway)) %>%
pull(dist_to_closest_highwaym)), 3)
scatter_4highways <-
ggplot(data = prop500,
aes(x = dist_to_closest_highwaym, y = log_price_perTA,
color = closest_highway, fill = closest_highway)) +
geom_point(alpha = .2, size = 0.3, show.legend = F) +
geom_smooth(method = "lm", size = 1.2, show.legend = F) +
scale_color_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
scale_fill_manual(values = c("US-1" = "#1a9988",
"I-76" = "#eb5600",
"I-95" = "#eba075",
"I-676" = "#7fe3d6")) +
geom_text(aes(label = paste("US-1:", cor_us1),
x = 150,
y = 1.25),
color = "#1a9988",
show.legend = F) +
geom_text(aes(label = paste("I-76:", cor_i76),
x = 350,
y = 1.25),
color = "#eb5600",
show.legend = F) +
geom_text(aes(label = paste("I-95:", cor_i95),
x = 350,
y = 8.75),
color = "#eba075",
show.legend = F) +
geom_text(aes(label = paste("I-676:", cor_i676),
x = 150,
y = 8.75),
color = "#7fe3d6",
show.legend = F) +
labs(title = "Property Price (log price per TA) as a function of Distance to Closest Highway",
subtitle = "Properties within 500m of highways",
x = "Distance to Closest Highway (m)",
y = "Property Sale Price (log price per TA)") +
theme_minimal()
scatter_4highways
From this scatter plot, we see that properties within 500m of I-676 and I-76 follow the expected pattern of increasing property prices as one gets further from the highway (folks want to avoid the noise and air pollution from the highway). The correlation between distance to closest highway (in meters) and Property sale price (log-transformed price per total area) is 0.147 and 0.107 for properties near I-676 and I-76, respectively, meaning for every additional meter away from the highway, property price per square foot of total area increases by 15.8% and 11.3%, respectively.
We see a near null effect for property prices and distance to US-1 (correlation of 0.023), but we see a negative relationship between distance to I-95 and property price (correlation of -0.134). This means that for every additional meter away from the highway, property price per square foot of total area decreases by 12.5%.
park_clipped <- st_intersection(park, studyarea)
park_valid <- st_make_valid(park)
transit_valid <- st_make_valid(transit)
city_hall_valid <- st_make_valid(city_hall)
school_valid <- st_make_valid(school)
water_valid <- st_make_valid(water)
bike_network_valid <- st_make_valid(bike_network)
studyarea_buffer <- st_buffer(studyarea, dist = 1000)
# Clip each dataset to the study area
recreation_clipped <- st_intersection(park, studyarea_buffer)
transit_clipped <- st_intersection(transit_valid, studyarea_buffer)
city_hall_clipped <- st_intersection(city_hall_valid, studyarea_buffer)
school_clipped <- st_intersection(school_valid, studyarea_buffer)
water_clipped <- st_intersection(water_valid, studyarea_buffer)
bike_network_clipped <- st_intersection(bike_network_valid, studyarea_buffer)
landuse_transportation <- landuse_rename[landuse_rename$landuse == "Transportation", ]
property_vacant <- property_EDA %>%
filter(category_code_description %in% c("VACANT LAND","SINGLE FAMILY", "VACANT LAND - NON-RESIDENTIAL")) %>%
mutate(category_code_description = case_when(
category_code_description == "VACANT LAND - NON-RESIDENTIAL" ~ "VACANT LAND",
TRUE ~ category_code_description
))
landuse_colors <- c(
"Residential Low" = "#FFFFB3",
"Residential Medium" = "#FFEB8B",
"Residential High" = "#FFCC80",
"Commercial Consumer" = "#FFB3B3",
"Commercial Business/Professional" = "#FF9980",
"Commercial Mixed Residential" = "#FF6666",
"Industrial" = "#D6A3FF",
"Civic/Institution" = "#66B3FF",
"Transportation" = "gray95",
"Culture/Amusement" = "#C1FFC1",
"Active Recreation" = "#66FFD9",
"Park/Open Space" = "#99FF66",
"Cemetery" = "#FF9999",
"Water" = "#99CCFF",
"Vacant" = "#D3D3D3",
"Other/Unknown" = "gray25"
)
BasicGeo_plot1 <- ggplot(data = landuse_rename %>%
filter(!is.na(landuse) & landuse != "Park/Open Space")) +
geom_sf(aes(fill = landuse), color = NA) +
geom_sf(data = park_clipped, aes(fill = "Park"), color = NA, alpha = 1) + # Clipped park layer
scale_fill_manual(values = c(landuse_colors, "Park" = "#99FF66"), na.value = "white") +
labs(title = "Land Use",
subtitle = "Properties functions in Chinatown",
fill = "Category") +
theme_void() +
theme(legend.position = "right",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_text(size = 10),
legend.key.size = unit(0.5, "cm"),
legend.text = element_text(size = 8))
BasicGeo_plot2 <- ggplot() +
geom_sf(data = landuse, fill = "white", color = "gray90", size = 0.05) +
geom_sf(data = recreation_clipped, aes(fill = "Park"), color = NA, alpha = 1) +
geom_sf(data = landuse_transportation, aes(fill = "Transportation"), color = NA, alpha = 0.5) +
geom_sf(data = transit_clipped, aes(color = "Transit"), size = 2) +
geom_sf(data = school_clipped, aes(color = "School"), size = 2) +
geom_sf(data = bike_network_clipped, aes(color = "Bike Network"), size = 1) +
scale_color_manual(values = c("Transit" = "blue", "School" = "orange", "Bike Network" = "purple"), name = NULL) +
scale_fill_manual(values = c("Park" = "green","Transportation" = "gray70"), name = NULL) +
labs(
title = "Amenities",
subtitle = "<span style='color:orange;'>Schools</span>, <span style='color:blue;'>Transit Stations</span>, <span style='color:green;'>Parks</span>, and <span style='color:purple;'>Bike Lanes</span>"
) +
theme_void() +
theme(
legend.position = "none",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = element_markdown(size = 10, hjust = 0),
legend.key.size = unit(0.5, "cm"),
legend.text = element_text(size = 8)
)
BasicGeo_plot3 <- ggplot() +
geom_sf(data = landuse, fill = "white", color = "gray85", size = 0.05) +
geom_sf(data = studyarea, fill = "transparent", color = "grey", linetype = "dashed", linewidth = 2) +
geom_sf(data = discontinuity, fill = "#eb5600", color = "transparent") +
geom_sf(data = Chinatown_Stitch, fill = "#1a9988", alpha = 0.8) +
geom_sf(data = property_vacant, aes(color = category_code_description), size = 1) +
scale_colour_manual(values = c("black", "#B67352")) +
labs(title="Potential Lands",
subtitle = glue("<span style='color:#1a1a1a;'>Single Family & </span><span style='color:#B67352;'>Vacant Land</span>"),
# caption = "Figure "
) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = ggtext::element_markdown(size = 10))
BasicGeo_plot1
BasicGeo_plot2 | BasicGeo_plot3
|
|
|
One of the key deliverables that OTIS wants from us includes an analysis on the effect of I-676 on property prices nearby. Here, we construct a relatively simple linear model with a few key independent variables to look at the effect of I-676 on property prices while accounting for some control variables such as the square footage of the property, number of bathrooms, year built, zoning, and zip code among others.
# keep properties within 500m of I-676
i676_buff <- st_union(st_buffer(philly_highways %>%
filter(ST_RT_NO == "0676"),
buff500))
# add features for simple linear regression
propi676_500 <- prop500[i676_buff,] %>%
# some feature engineering
mutate(residential = ifelse(category_code_description %in% c("APARTMENTS > 4 UNITS",
"MIXED USE",
"MULTI FAMILY",
"SINGLE FAMILY"),
1, 0), # is this property residential? (including mixed)
year_built = as.numeric(year_built),
year_built_group = case_when(
is.na(year_built) | year_built < 1990 ~ "Pre-90s",
year_built >= 1990 & year_built < 2000 ~ "90s",
year_built >= 2000 & year_built < 2010 ~ "00s",
year_built >= 2010 & year_built < 2020 ~ "10s",
year_built >= 2020 ~ "20s"
),
log_dist_i676 = log(dist_to_I676),
central_air = case_when(
central_air %in% c("1", "Y") ~ "Y",
TRUE ~ "N"
),
central_air = factor(central_air),
exterior_condition = case_when(
exterior_condition %in% c("1","2", "3", "4") ~ "AboveAverage",
exterior_condition == "5" ~ "Average",
exterior_condition %in% c("6","8","9") ~ "BelowAverage",
.default = "NAorVacant"
),
exterior_condition = factor(exterior_condition),
interior_condition = case_when(
interior_condition %in% c("1","2","3") ~ "AboveAverage",
interior_condition == "4" ~ "Average",
interior_condition %in% c("5","7") ~ "BelowAverage",
.default = "NAorVacant"
),
interior_condition = factor(interior_condition),
garage_spaces = case_when(
is.na(garage_spaces) | garage_spaces == 0 ~ "0",
garage_spaces == 1 ~ "1",
.default = "2orMore"
),
garage_spaces = factor(garage_spaces),
number_of_bathrooms = case_when(
number_of_bathrooms == 1 ~ "1",
number_of_bathrooms == 2 ~ "2",
number_of_bathrooms > 2 ~ "3orMore",
.default = "0" # NA or 0
),
number_of_bathrooms = factor(number_of_bathrooms),
number_of_bedrooms = case_when(
number_of_bedrooms >= 3 ~ "3orMore",
is.na(number_of_bedrooms) ~ "0",
.default = as.character(number_of_bedrooms)
),
number_of_bedrooms = factor(number_of_bedrooms),
quality_grade_recoded = case_when(
quality_grade %in% c("X", "X-", "X ") ~ "HighestQuality",
quality_grade %in% c("A+", "A", "A ", "A-", "1", "2") ~ "HigherQuality",
quality_grade %in% c("B+", "B", "B ", "B-", "3", "4") ~ "AverageQuality",
quality_grade %in% c("S+", "S", "C+", "5", "6") ~ "LowerQuality",
.default = "LowestQuality" # all lower grades + missing data
),
quality_grade_recoded = factor(quality_grade_recoded),
separate_utilities = factor(separate_utilities),
building_code_description_new = case_when(
grepl("GYM", building_code_description_new) ~ "GYM",
grepl("^WARE", building_code_description_new) ~ "WAREHOUSE",
grepl("COLLEGE|SCHOOL", building_code_description_new) ~ "SCHOOL/COLLEGE/UNI",
grepl("ROW MIXED", building_code_description_new) ~ "ROWHOME MIXED",
grepl("TWIN MIXED", building_code_description_new) ~ "TWIN MIXED",
grepl("^ROW|^TWIN|CONDO|AS RESID", building_code_description_new) ~ "ROW/TWIN/CONDO", # apt blt as resid
grepl("BOARDING|SNGL", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("MEDICAL", building_code_description_new) ~ "MEDICAL OFFICE",
grepl("FRAT", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
grepl("LOW RISE", building_code_description_new, ignore.case = T) ~ "LOW RISE BLDG",
grepl("MID RISE", building_code_description_new) ~ "MID RISE APT",
grepl("HIGH RISE", building_code_description_new) ~ "HIGH RISE APT",
grepl("PARKING", building_code_description_new) ~ "PARKING",
grepl("BAR/", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("LEGIT", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("DIALYSI", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("OFFICE WA", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("CONV FOOD", building_code_description_new) ~ building_code_description_new, # unchanged names
grepl("MANUFACTU", building_code_description_new) ~ building_code_description_new, # unchanged names
.default = "OTHER" # including missing
),
building_code_description_new = factor(building_code_description_new),
zoning_group = case_when(
zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "Nghbrhd Commercial Mixed",
zoning %in% c("CMX3", "CMX4") ~ "Cmmnty Commercial Mixed",
zoning == "CMX5" ~ "CC Commercial Mixed",
zoning == "RM1" ~ "Resid MultiFam-Big",
zoning == "RM4" ~ "Resid MultiFam-Small",
zoning == "RMX3" ~ "Resid Mixed",
grepl("RSA", zoning) ~ "Resid SnglFam Attached",
zoning == "RTA1" ~ "Resid 2Fam Attached",
zoning == "IRMX" ~ "Indstrl Resid Mixed",
.default = "Other" # including missing
)
) %>%
select( # only keep vars necessary for model
log_price,
dist_to_I676m, log_dist_i676,
residential,
year_built,
central_air, garage_spaces, separate_utilities,
number_of_bathrooms, number_of_bedrooms, total_area,
exterior_condition, interior_condition, quality_grade_recoded,
building_code_description_new, zoning_group,
zip_code
)
# need to check which of residential, building_code_description_new, or zoning is the best in this model
# also need to check if turning some of the numeric variables into categorical is better (year_built may benefit from this)
# map to visualize
# ggplot() +
# geom_sf(data = philly_bounds, fill = "#c9b887", color = "grey") +
# geom_sf(data = hydro, fill = "#7d97c7", color = "transparent") +
# geom_sf(data = philly_parks, fill = "#6b9169", color = "transparent") +
# geom_sf(data = i676_buff,
# color = "transparent", fill = "grey", alpha = 0.4) +
# geom_sf(data = propi676_500, size = 0.3, color = "#7fe3d6",
# show.legend = F) +
# geom_sf(data = philly_highways %>%
# filter(ST_RT_NO == "0676"),
# color = "black") +
# labs(title = "Properties within 500m of I-676") +
# theme_void()
# Numeric variables
# number_of_rooms was previously included but all but 10 values are NA
# propi676_500_numvars <- propi676_500 %>%
# select(log_price, dist_to_I676m,
# # depth, fireplaces, frontage, garage_spaces, number_stories, # ended up excluding
# number_of_bathrooms,
# number_of_bedrooms, total_area, year_built) %>%
# mutate(year_built = as.numeric(year_built)) %>%
# st_drop_geometry()
#
# cors_logprice <- cor(propi676_500_numvars,
# use = "pairwise.complete.obs")
#
# # significance test
# # sigtest_logprice <- cor.mtest(propi676_500_numvars)
#
# # pretty correlation matrix of all continuous variables
# corrplot(cors_logprice,
# title = "Correlation Matrix for Log Price",
# # p.mat = sigtest_logprice$p,
# method = 'circle',
# type = 'lower',
# insig='blank',
# addCoef.col ='black',
# number.cex = 0.8,
# diag=FALSE)
# Based off of this correlation matrix, I will move forward with number_of_bathrooms (instead of number_of_bedrooms) since it has a higher correlation with log_price
# total_area will be included (and as a result, I won't keep frontage, depth, and number_of_stories because of multicollinearity)
# Categorical variables
## Basements
# not including because there isn't enough of a difference in log price across categories
# unique(propi676_500$basements)
# table(propi676_500$basements)
# summary(propi676_500$basements)
#
# temp <- propi676_500 %>%
# mutate(basement_finish = case_when(basements %in% c("A", "B", "E", "F") ~ "Finished", # including finished and semi-finished
# basements %in% c("C", "G", "J") ~ "Unfinished",
# basements == "0" ~ "No Basement",
# .default = "Unknown Finish"),
# basement_size = case_when(basements %in% c("A", "B", "C", "D") ~ "Full", # including finished and semi-finished
# basements %in% c("E", "F", "G", "H") ~ "Partial",
# basements == "0" ~ "No Basement",
# .default = "Unknown Size"))
#
# ggplot(data = temp %>%
# group_by(basement_finish) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = basement_finish, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) for Different Basement Finishes",
# x = "Basement Finishes",
# y = "Mean Price")
#
# ggplot(data = temp %>%
# group_by(basement_size) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = basement_size, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) for Different Basement Size",
# x = "Basement Size",
# y = "Mean Price")
## Category Code
# only including whether a property is residential or not
# unique(propi676_500$category_code)
# table(propi676_500$category_code)
# summary(propi676_500$category_code)
#
# temp <- propi676_500 %>%
# mutate(category_simple = case_when(category_code_description %in% c("APARTMENTS > 4 UNITS",
# "MIXED USE",
# "MULTI FAMILY",
# "SINGLE FAMILY") ~ "Residential or Mixed",
# grepl("INDUS", category_code_description) ~ "Industrial",
# grepl("HOTEL", category_code_description) ~ "Hotel",
# grepl("OFFICES", category_code_description) ~ "Offices",
# category_code_description %in% c("COMMERCIAL",
# "GARAGE - COMMERCIAL",
# "RETAIL") ~ "Commercial",
# grepl("VACANT", category_code_description) ~ "Vacant",
# .default = "Other"),
# residential = ifelse(category_simple == "Residential or Mixed", 1, 0))
# ggplot(data = temp %>%
# group_by(category_simple) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = category_simple, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Category",
# x = "Property Category",
# y = "Mean Price")
# ggplot(data = temp %>%
# group_by(residential) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = residential, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Category (Residential or not)",
# x = "Property Category",
# y = "Mean Price")
## Central Air
# not including because there isn't enough of a difference in log price across categories
# unique(propi676_500$central_air)
# table(propi676_500$central_air)
# summary(propi676_500$central_air)
#
# temp <- propi676_500 %>%
# mutate(central_air_new = case_when(central_air == "Y" ~ central_air,
# .default = "N or unknown"))
#
# ggplot(data = temp %>%
# group_by(central_air_new) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price"),
# aes(x = central_air_new, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# labs(title = "Mean Property Price (raw and log-transformed) by Central Air",
# x = "Central Air",
# y = "Mean Price")
# parcel_shape
# three categories (A,B,E) seem to be good enough
# propi676_500 %>%
# select(parcel_shape, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(parcel_shape) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = parcel_shape, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# separate_utilities
# four categories (A,B,C,NA) seem to be good enough
# propi676_500 %>%
# select(separate_utilities, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(separate_utilities) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = separate_utilities, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# fireplaces
# not a huge factor, so not including in model for now
# but if i were to use this, i think i would just group this into a binary indicating the presence of a fireplace on a property
# unique(propi676_500$fireplaces)
# table(propi676_500$fireplaces)
# summary(propi676_500$fireplaces)
#
# propi676_500 %>%
# select(fireplaces, adjusted_sale_price, log_price) %>%
# st_drop_geometry() %>%
# group_by(fireplaces) %>%
# summarize(mean_price = mean(adjusted_sale_price),
# mean_logprice = mean(log_price)) %>%
# ungroup() %>%
# pivot_longer(cols = 2:3, names_to = "price_type", values_to = "mean_price") %>%
# ggplot(aes(x = fireplaces, y = mean_price)) +
# geom_bar(stat = "identity", fill = "#1a9988", alpha = 0.8, width = 0.3) +
# facet_wrap(~price_type, scales = "free") +
# theme_minimal() +
# theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
# i realized halfway through doing this that I could use what Sijia started with
# based off of the feature engineering that Sijia did
# property_CT <- property_final %>%
# mutate(
# year_built = as.numeric(year_built),
# year_built_group = case_when(
# # year_built < 1990 ~ "Before 1990",
# # year_built >= 1990 & year_built < 2000 ~ "1990-1999",
# # year_built >= 2000 & year_built < 2020 ~ "2000s-2020s",
# # year_built >= 2020 ~ "2020s+"
# is.na(year_built) | year_built < 1990 ~ "Pre-90s",
# year_built >= 1990 & year_built < 2000 ~ "90s",
# year_built >= 2000 & year_built < 2010 ~ "00s",
# year_built >= 2010 & year_built < 2020 ~ "10s",
# year_built >= 2020 ~ "20s"
# ),
#
# # log_price = log(adj_sale_price),
# log_dist_highway = log(dist_to_I676),
#
# central_air = case_when(
# central_air %in% c("1", "Y") ~ "Y",
# TRUE ~ "N"
# ),
# central_air = factor(central_air),
#
# exterior_condition = case_when(
# # is.na(exterior_condition) | exterior_condition == 1 ~ "1",
# # exterior_condition == 3 ~ "2",
# # exterior_condition %in% c(2, 4, 5) ~ "3",
# exterior_condition %in% c("1","2", "3", "4") ~ "AboveAverage",
# exterior_condition == "5" ~ "Average",
# exterior_condition %in% c("6","8","9") ~ "BelowAverage",
# .default = "NAorVacant")
# ),
# exterior_condition = factor(exterior_condition),
#
# interior_condition = case_when(
# # interior_condition %in% c(0, 1, 2, 3) ~ "1",
# # interior_condition %in% c(4, NA) ~ "2",
# # TRUE ~ "3"
# interior_condition %in% c("1","2","3") ~ "AboveAverage",
# interior_condition == "4" ~ "Average",
# interior_condition %in% c("5","7") ~ "BelowAverage",
# .default = "NAorVacant"
# ),
# interior_condition = factor(interior_condition),
#
# garage_spaces = case_when(
# is.na(garage_spaces) | garage_spaces == 0 ~ "0",
# garage_spaces == 1 ~ "1",
# # TRUE ~ "2"
# .default = "2orMore"
# ),
# garage_spaces = factor(garage_spaces),
#
# # general_construction = case_when(
# # general_construction %in% c("A ", "B ", "D ") ~ "A",
# # general_construction == "3 " ~ "C",
# # TRUE ~ "B"
# # ),
# # general_construction = factor(general_construction),
#
# number_of_bathrooms = case_when(
# number_of_bathrooms == 1 ~ "1",
# # number_of_bathrooms %in% c(2, NA) ~ "2",
# # number_of_bathrooms == 3 ~ "3",
# # TRUE ~ "4"
# number_of_bathrooms == 2 ~ "2",
# number_of_bathrooms > 2 ~ "3orMore",
# .default = "0" # NA or 0
# ),
# number_of_bathrooms = factor(number_of_bathrooms),
#
# number_of_bedrooms = case_when(
# # number_of_bedrooms %in% c(0, 12) | is.na(number_of_bedrooms) ~ "0",
# # number_of_bedrooms %in% c(1, 2) ~ "1",
# # number_of_bedrooms == 3 ~ "2",
# # number_of_bedrooms == 4 ~ "3",
# # number_of_bedrooms %in% c(5, 6) ~ "4",
# # number_of_bedrooms %in% c(7, 8, 9) ~ "5"
# number_of_bedrooms >= 3 ~ "3orMore",
# is.na(number_of_bedrooms) ~ "0",
# .default = as.character(number_of_bedrooms)
# ),
# number_of_bedrooms = factor(number_of_bedrooms),
#
# quality_grade_recoded = case_when(
# # quality_grade %in% c("A+", "A") ~ "1",
# # quality_grade %in% c("A-","3", "7") ~ "2",
# # TRUE ~ "3"
# quality_grade %in% c("X", "X-", "X ") ~ "HighestQuality",
# quality_grade %in% c("A+", "A", "A ", "A-", "1", "2") ~ "HigherQuality",
# quality_grade %in% c("B+", "B", "B ", "B-", "3", "4") ~ "AverageQuality",
# quality_grade %in% c("S+", "S", "C+", "5", "6") ~ "LowerQuality",
# .default = "LowestQuality" # all lower grades + missing data
# ),
# quality_grade_recoded = factor(quality_grade_recoded),
#
# # separate_utilities = case_when(
# # separate_utilities %in% c("A", "C") ~ "B",
# # TRUE ~ "A"
# # ),
# # separate_utilities = factor(separate_utilities),
#
# building_code_description_new = case_when(
# # building_code_description_new %in% c("COLONIAL", "OLD STYLE", "ROW MODERN", "ROW OLD STYLE", "TUDOR", "TWIN BUNGALOW") ~ "1",
# # building_code_description_new %in% c("ROW POST WAR", "ROW TYPICAL", "TWIN CONVENTIONAL") ~ "3",
# # building_code_description_new %in% c("OTHER", "ROW PORCH FRONT") ~ "4",
# # TRUE ~ "2"
# grepl("GYM", building_code_description_new) ~ "GYM",
# grepl("^WARE", building_code_description_new) ~ "WAREHOUSE",
# grepl("COLLEGE|SCHOOL", building_code_description_new) ~ "SCHOOL/COLLEGE/UNI",
# grepl("ROW MIXED", building_code_description_new) ~ "ROWHOME MIXED",
# grepl("TWIN MIXED", building_code_description_new) ~ "TWIN MIXED",
# grepl("^ROW|^TWIN|CONDO|AS RESID", building_code_description_new) ~ "ROW/TWIN/CONDO", # apt blt as resid
# grepl("BOARDING|SNGL", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
# grepl("MEDICAL", building_code_description_new) ~ "MEDICAL OFFICE",
# grepl("FRAT", building_code_description_new) ~ "BOARDING/ROOMING/SNGL OCC",
# grepl("LOW RISE", building_code_description_new, ignore.case = T) ~ "LOW RISE BLDG",
# grepl("MID RISE", building_code_description_new) ~ "MID RISE APT",
# grepl("HIGH RISE", building_code_description_new) ~ "HIGH RISE APT",
# grepl("PARKING", building_code_description_new) ~ "PARKING",
# grepl("BAR/", building_code_description_new) ~ building_code_description_new, # unchanged names
# grepl("LEGIT", building_code_description_new) ~ building_code_description_new, # unchanged names
# grepl("DIALYSI", building_code_description_new) ~ building_code_description_new, # unchanged names
# grepl("OFFICE WA", building_code_description_new) ~ building_code_description_new, # unchanged names
# grepl("CONV FOOD", building_code_description_new) ~ building_code_description_new, # unchanged names
# grepl("MANUFACTU", building_code_description_new) ~ building_code_description_new, # unchanged names
# .default = "OTHER" # including missing
# ),
# building_code_description_new = factor(building_code_description_new),
#
# zoning_group = case_when(
# # zoning %in% c("CMX5") ~ "CMX5",
# # zoning %in% c("IRMX") ~ "IRMX",
# # zoning %in% c("RM1","RM2", "RM4") ~ "RM",
# # zoning %in% c("RMX3") ~ "RMX3",
# # zoning %in% c("RSA5") ~ "RSA5",
# # zoning %in% c("ICMX") ~ "ICMX",
# # zoning %in% c("SPPOA") ~ "SPPOA",
# # zoning %in% c("I2") ~ "I2",
# # zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "CMX(1,2,2.5)",
# # zoning %in% c("CMX3", "CMX4") ~ "CMX(3,4)",
# zoning %in% c("CMX2.5", "CMX2", "CMX1") ~ "Nghbrhd Commercial Mixed",
# zoning %in% c("CMX3", "CMX4") ~ "Cmmnty Commercial Mixed",
# zoning == "CMX5" ~ "CC Commercial Mixed",
# zoning == "RM1" ~ "Resid MultiFam-Big",
# zoning == "RM4" ~ "Resid MultiFam-Small",
# zoning == "RMX3" ~ "Resid Mixed",
# grepl("RSA", zoning) ~ "Resid SnglFam Attached",
# zoning == "RTA1" ~ "Resid 2Fam Attached",
# zoning == "IRMX" ~ "Indstrl Resid Mixed",
# .default = "Other" # including missing
# ),
# )
getRMSE <- function(model_results) {
# residual sum of squares
rss <- c(crossprod(model_results$residuals))
# Mean squared error:
mse <- rss / length(model_results$residuals)
# Root MSE:
RMSE <- sqrt(mse)
return(RMSE)
}
# model with just distance to highway
model_i676 <- lm(log_price ~ dist_to_I676m,
data = propi676_500)
model_i676_summary <- summary(model_i676)
model_i676_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m, data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9761 -0.5219 -0.1135 0.4409 3.0335
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.3743509 0.1260092 114.074 < 0.0000000000000002 ***
## dist_to_I676m -0.0019135 0.0003805 -5.029 0.000000789 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9929 on 351 degrees of freedom
## Multiple R-squared: 0.0672, Adjusted R-squared: 0.06455
## F-statistic: 25.29 on 1 and 351 DF, p-value: 0.0000007891
model_i676_RMSE <- getRMSE(model_i676)
# model with just distance to highway + zip code
model_i676_zip <- lm(log_price ~ dist_to_I676m + zip_code,
data = propi676_500)
model_i676_zip_summary <- summary(model_i676_zip)
model_i676_zip_RMSE <- getRMSE(model_i676_zip)
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade_recoded +
factor(zip_code), # neighborhood effects (via zip code for now)
data = propi676_500)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
From a overly simple model, just looking at how distance to I676 (m) predicts property price (log transformed), we see that for properties within 500m of I676, distance from the highway does not have a large effect on property price. We see that for every additional meter away from I676, property price decreases by 0.2% (basically a null effect). The R squared value of the model is very low at 0.067 and the RMSE is very high 0.99, meaning the model is terrible.
# model with just distance to highway + zip code
model_i676_zip <- lm(log_price ~ dist_to_I676m + zip_code,
data = propi676_500)
model_i676_zip_summary <- summary(model_i676_zip)
model_i676_zip_RMSE <- getRMSE(model_i676_zip)
model_i676_zip_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + zip_code, data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4559 -0.5364 -0.0900 0.3883 3.2469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.286503 0.380829 40.140 < 0.0000000000000002 ***
## dist_to_I676m -0.001763 0.000441 -3.998 0.000078 ***
## zip_code19103 -0.870646 0.391550 -2.224 0.026823 *
## zip_code19104 -1.496935 0.414420 -3.612 0.000349 ***
## zip_code19106 -0.969805 0.395024 -2.455 0.014579 *
## zip_code19107 -1.445813 0.403189 -3.586 0.000384 ***
## zip_code19123 -0.315026 0.401933 -0.784 0.433708
## zip_code19130 -1.020486 0.438004 -2.330 0.020390 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9194 on 345 degrees of freedom
## Multiple R-squared: 0.2138, Adjusted R-squared: 0.1979
## F-statistic: 13.41 on 7 and 345 DF, p-value: 0.00000000000000254
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade_recoded +
factor(zip_code), # neighborhood effects (via zip code for now)
data = propi676_500)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
When we add ZIP code to this simple model, the adjusted R squared increases to 0.198 and RMSE decreases to 0.909, indicating that we have a slightly better, but still bad model. In this ZIP + distance to I676 model, keeping all else constant property price still decreases only by 0.2% per every additional meter away from the expressway.
# simple model but with more predictors
model_i676_more <- lm(log_price ~ dist_to_I676m +
residential + # internal characteristics
year_built + central_air +
number_of_bathrooms + total_area +
quality_grade_recoded +
factor(zip_code), # neighborhood effects (via zip code for now)
data = propi676_500)
model_i676_more_summary <- summary(model_i676_more)
model_i676_more_RMSE <- getRMSE(model_i676_more)
model_i676_more_summary
##
## Call:
## lm(formula = log_price ~ dist_to_I676m + residential + year_built +
## central_air + number_of_bathrooms + total_area + quality_grade_recoded +
## factor(zip_code), data = propi676_500)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.42503 -0.30862 -0.00947 0.30665 2.61892
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 14.413422595 0.499280750 28.868
## dist_to_I676m -0.000484338 0.000361651 -1.339
## residential -0.650732451 0.152974130 -4.254
## year_built 0.000168566 0.000202783 0.831
## central_airY 0.039468084 0.128035661 0.308
## number_of_bathrooms1 -0.568288121 0.137764180 -4.125
## number_of_bathrooms2 -0.112000409 0.144022500 -0.778
## number_of_bathrooms3orMore 0.197076160 0.149153688 1.321
## total_area 0.000051908 0.000005649 9.189
## quality_grade_recodedHigherQuality 0.242072835 0.201031136 1.204
## quality_grade_recodedHighestQuality 1.262089310 0.490235277 2.574
## quality_grade_recodedLowerQuality -0.038399167 0.126012426 -0.305
## quality_grade_recodedLowestQuality 0.212226594 0.094495872 2.246
## factor(zip_code)19103 -0.215029069 0.299579671 -0.718
## factor(zip_code)19104 -0.986370091 0.313695835 -3.144
## factor(zip_code)19106 -0.436644503 0.301382009 -1.449
## factor(zip_code)19107 -0.565890700 0.308259210 -1.836
## factor(zip_code)19123 -0.330145337 0.307755414 -1.073
## factor(zip_code)19130 -0.400066655 0.337101622 -1.187
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## dist_to_I676m 0.18147
## residential 0.0000278 ***
## year_built 0.40646
## central_airY 0.75809
## number_of_bathrooms1 0.0000476 ***
## number_of_bathrooms2 0.43736
## number_of_bathrooms3orMore 0.18737
## total_area < 0.0000000000000002 ***
## quality_grade_recodedHigherQuality 0.22944
## quality_grade_recodedHighestQuality 0.01050 *
## quality_grade_recodedLowerQuality 0.76078
## quality_grade_recodedLowestQuality 0.02541 *
## factor(zip_code)19103 0.47344
## factor(zip_code)19104 0.00183 **
## factor(zip_code)19106 0.14840
## factor(zip_code)19107 0.06735 .
## factor(zip_code)19123 0.28421
## factor(zip_code)19130 0.23622
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6759 on 311 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.5654, Adjusted R-squared: 0.5402
## F-statistic: 22.47 on 18 and 311 DF, p-value: < 0.00000000000000022
Finally, we look at a more robust model (which is still relatively simple, including only internal features of the property in addition to ZIP code and distance to I676). This model includes the following predictors: year_built (categorical), quality_grade (categorical), number_of_bathrooms (categorical), total_area (continuous), central_air (binary), and a variable dictating whether the property is residential or not (binary). This more complex model does a better job at explaining the variance in property price than our previous two models, but is not optimal either. The adjusted R squared for this model is 0.54 and our RMSE is 0.656. When including these other predictors, the effect of distance to highway is not only much smaller than before, but is now also not statistically significant.
We move forward with our analyses with the hypothesis that although distance to I676 does not play as much of a role in predicting property price, the Chinatown Stitch project will create an open green space that we can conceptualize as a park. We predict that the Chinatown Stitch cap as a park will have a statistically significant effect on property prices nearby.
property_discontinutiy <- property_EDA_update %>%
mutate(distance_to_highway = case_when(
I676 == "north" ~ distance_to_I676,
I676 == "south" ~ -distance_to_I676
))
discontinuity_plot <- property_discontinutiy %>%
st_drop_geometry() %>%
filter(sale_price > 10 & sale_price < 1e6) %>%
group_by(distance_to_highway, I676) %>%
summarise(log_sale_price = mean(log_sale_price, na.rm = TRUE), .groups = "drop")
ggplot(discontinuity_plot, aes(x = distance_to_highway, y = log_sale_price, color = I676)) +
geom_point(size = 0.5) +
geom_smooth(method = "lm", se = FALSE, size = 1.2) +
geom_vline(xintercept = 0, color = "black", size = 2) +
scale_colour_manual(values = c("#eb5600", "#1a9988")) +
theme_minimal() +
theme(legend.position = "None",
plot.title = element_text(size = 16, face = "bold"),
plot.subtitle = ggtext::element_markdown(size = 12),
plot.caption = element_text(hjust = 0)) +
labs(title = 'Logged Sale Price as a Function of Distance to the Highway',
subtitle = glue("<span style='color:#1a9988;'>South Side</span> vs. <span style='color:#eb5600;'>North Side</span>"),
# caption = "Figure 4.1",
# x = "South Side of the Highway I-676 North Side of the Highway",
x = "Distance to the Highyway (ft) ",
y = "Log-transformed Sale Price") +
geom_label(aes(x = 0, y = 8,
label = "I-676"),
fill = "black", color = "white", fontface = "bold", size = 5)
2.4 Social Survey